library(tidyverse)
## ─ Attaching packages ───────────────── tidyverse 1.2.1 ─
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.3.4 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ─ Conflicts ─────────────────── tidyverse_conflicts() ─
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
Explore the distribution of each of the x, y, and z variables in diamonds. What do you learn? Think about a diamond and how you might decide which dimension is the length, width, and depth.
# Scatterplot matrix of the three diamond dimension columns (x, y, z),
# used to compare how strongly each pair of dimensions is related.
diamonds %>%
  select(x, y, z) %>%
  ggpairs()
ダイアモンドは円形なので一番相関の大きいx,yがlength,widthでしょう.
Explore the distribution of price. Do you discover anything unusual or surprising? (Hint: Carefully think about the binwidth and make sure you try a wide range of values.)
# Price histogram with 50-unit bins: the full range on top and, below it,
# the same histogram zoomed to the 1000-2000 window with coord_cartesian().
grid.arrange(
  grobs = list(
    ggplot(diamonds) +
      geom_histogram(aes(x = price), binwidth = 50),
    ggplot(diamonds) +
      geom_histogram(aes(x = price), binwidth = 50) +
      coord_cartesian(xlim = c(1000, 2000))
  )
)
価格1500にデータが存在しない!不思議!
How many diamonds are 0.99 carat? How many are 1 carat? What do you think is the cause of the difference?
# Dimensions (rows, columns) of the subsets of diamonds weighing 0.99 and
# 1 carat. carat is a continuous measurement, so it is matched with
# dplyr::near() (tolerance-based comparison) rather than `==`, which would
# depend on the exact floating-point representation of the values.
print(dim(diamonds %>% filter(near(carat, 0.99))))
## [1] 23 10
print(dim(diamonds %>% filter(near(carat, 1))))
## [1] 1558 10
ちゃんとキリのいいcaratで作りたいんちゃう.
Compare and contrast coord_cartesian() vs xlim() or ylim() when zooming in on a histogram. What happens if you leave binwidth unset? What happens if you try and zoom so only half a bar shows?
xlim() vs coord_cartesian()
# Zooming on the x axis, two ways.
# Top: coord_cartesian() only changes the visible window.
# Bottom: xlim() restricts the scale itself, so rows outside 1000-2000 are
# discarded before binning (this is what triggers the "Removed ... rows"
# warning).
grid.arrange(
  ggplot(diamonds, aes(price)) +
    geom_histogram(binwidth = 50) +
    coord_cartesian(xlim = c(1000, 2000)),
  ggplot(diamonds, aes(price)) +
    geom_histogram(binwidth = 50) +
    xlim(1000, 2000)
)
## Warning: Removed 44232 rows containing non-finite values (stat_bin).
ylim() vs coord_cartesian()
# Zooming on the y axis with the default binning (binwidth left unset).
#   1. unzoomed reference histogram
#   2. coord_cartesian() clips the view to counts 0-100
#   3. ylim() sets scale limits, so bars whose count falls outside 0-100
#      are removed (see the geom_bar warning) instead of being cut off
grid.arrange(
  grobs = list(
    ggplot(diamonds) +
      geom_histogram(aes(x = price)),
    ggplot(diamonds) +
      geom_histogram(aes(x = price)) +
      coord_cartesian(ylim = c(0, 100)),
    ggplot(diamonds) +
      geom_histogram(aes(x = price)) +
      ylim(0, 100)
  )
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 29 rows containing missing values (geom_bar).
この結果はかなり謎…一つ目,二つ目は妥当,なぜylimで消える??
What happens to missing values in a histogram? What happens to missing values in a bar chart? Why is there a difference?
まず,各列の欠測の数え方
# Number of missing values in each column of diamonds, as a named integer
# vector (one entry per column).
vapply(diamonds, function(column) sum(is.na(column)), integer(1))
## carat cut color clarity depth table price x y
## 0 0 0 0 0 0 0 0 0
## z
## 0
library(nycflights13)
# Number of missing values in each column of flights, as a named integer
# vector (one entry per column).
vapply(flights, function(column) sum(is.na(column)), integer(1))
## year month day dep_time sched_dep_time
## 0 0 0 8255 0
## dep_delay arr_time sched_arr_time arr_delay carrier
## 8255 8713 0 9430 0
## flight tailnum origin dest air_time
## 0 2512 0 0 9430
## distance hour minute time_hour
## 0 0 0 0
geom_bar用にmissingあり離散列が欲しいがない(tailnumはユニークidが多い)ので無理やり作る
# Build a discrete column that inherits the missing values of dep_delay
# (if_else() returns NA where the condition is NA), so that a bar chart has
# NAs to deal with.
flights2 <- mutate(
  flights,
  delay_type = if_else(dep_delay > 0, 'late', 'fast')
)

# Histogram of a continuous column with NAs vs. bar chart of a discrete
# column with NAs, stacked for comparison.
p1 <- ggplot(flights, aes(air_time)) +
  geom_histogram()
p2 <- ggplot(flights2, aes(delay_type)) +
  geom_bar()
grid.arrange(p1, p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9430 rows containing non-finite values (stat_bin).
つまりbar plotはNAも個別のラベルとしてカウントする!!
What does na.rm = TRUE do in mean() and sum()?
# Mean air time per destination, deliberately WITHOUT na.rm: every
# destination with at least one missing air_time yields NA, which is the
# behaviour this example is meant to show.
summarise(
  group_by(flights, dest),
  avg = mean(air_time)
)
## # A tibble: 105 x 2
## dest avg
## <chr> <dbl>
## 1 ABQ 249.1693
## 2 ACK NA
## 3 ALB NA
## 4 ANC 413.1250
## 5 ATL NA
## 6 AUS NA
## 7 AVL NA
## 8 BDL NA
## 9 BGR NA
## 10 BHM NA
## # ... with 95 more rows
# Mean air time per destination with missing values dropped first
# (na.rm = TRUE), so every destination gets a numeric average.
summarise(
  group_by(flights, dest),
  avg = mean(air_time, na.rm = TRUE)
)
## # A tibble: 105 x 2
## dest avg
## <chr> <dbl>
## 1 ABQ 249.16929
## 2 ACK 42.06818
## 3 ALB 31.78708
## 4 ANC 413.12500
## 5 ATL 112.93045
## 6 AUS 212.72791
## 7 AVL 89.88889
## 8 BDL 25.46602
## 9 BGR 54.11732
## 10 BHM 122.77695
## # ... with 95 more rows
NAが一つでもあると平均や合計の結果が全てNAになってしまうのでna.rm=TRUEで無視しましょう.
Use what you’ve learned to improve the visualisation of the departure times of cancelled vs. non-cancelled flights.
# One row per flight with:
#   cancelled      - TRUE when dep_time is missing
#   sched_hour     - hour part of the HHMM-encoded sched_dep_time
#   sched_min      - minute part of the HHMM-encoded sched_dep_time
#   sched_dep_time - scheduled departure as fractional hours
# transmute() keeps only the columns it creates, in the order listed.
cancelled <- transmute(
  nycflights13::flights,
  cancelled = is.na(dep_time),
  sched_hour = sched_dep_time %/% 100,
  sched_min = sched_dep_time %% 100,
  sched_dep_time = sched_hour + sched_min / 60
)
print(cancelled)
## # A tibble: 336,776 x 4
## cancelled sched_hour sched_min sched_dep_time
## <lgl> <dbl> <dbl> <dbl>
## 1 FALSE 5 15 5.250000
## 2 FALSE 5 29 5.483333
## 3 FALSE 5 40 5.666667
## 4 FALSE 5 45 5.750000
## 5 FALSE 6 0 6.000000
## 6 FALSE 5 58 5.966667
## 7 FALSE 6 0 6.000000
## 8 FALSE 6 0 6.000000
## 9 FALSE 6 0 6.000000
## 10 FALSE 6 0 6.000000
## # ... with 336,766 more rows
# Three views of scheduled departure time, split by cancellation status,
# using quarter-hour bins for the frequency polygons:
#   p1 - raw counts
#   p2 - densities (..density..), so the two groups share a common scale
#   p3 - horizontal boxplots
p1 <- ggplot(cancelled, aes(sched_dep_time)) +
  geom_freqpoly(aes(colour = cancelled), binwidth = 0.25)
p2 <- ggplot(cancelled, aes(sched_dep_time, ..density..)) +
  geom_freqpoly(aes(colour = cancelled), binwidth = 0.25)
p3 <- ggplot(cancelled, aes(cancelled, sched_dep_time)) +
  geom_boxplot() +
  coord_flip()
grid.arrange(p1, p2, p3)
What variable in the diamonds dataset is most important for predicting the price of a diamond? How is that variable correlated with cut? Why does the combination of those two relationships lead to lower quality diamonds being more expensive?
#
ここちょっとむずいので後
Install the ggstance package, and create a horizontal boxplot. How does this compare to using coord_flip()?
library(ggstance)
##
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
# Horizontal boxplots drawn directly with ggstance::geom_boxploth(): the
# continuous variable goes on x and the categorical one on y (the reverse
# of geom_boxplot() + coord_flip()).
p1 <- ggplot(mpg, aes(hwy, class)) +
  geom_boxploth()
p2 <- ggplot(diamonds, aes(price, color)) +
  geom_boxploth()
p3 <- ggplot(diamonds, aes(price, cut, fill = color)) +
  geom_boxploth()
grid.arrange(p1, p2, p3)
まじで,coord_flip()使えばいいと思う.x, yの指定が逆になるので注意.
One problem with boxplots is that they were developed in an era of much smaller datasets and tend to display a prohibitively large number of “outlying values”. One approach to remedy this problem is the letter value plot. Install the lvplot package, and try using geom_lv() to display the distribution of price vs cut. What do you learn? How do you interpret the plots?
library(lvplot)
# Price by cut: a classic boxplot on top and a letter-value plot below,
# with the letter-value boxes filled by their level (..LV..).
grid.arrange(
  ggplot(diamonds, aes(cut, price)) +
    geom_boxplot(),
  ggplot(diamonds, aes(cut, price)) +
    geom_lv(aes(fill = ..LV..)) +
    scale_fill_lv()
)
geom_lvちょっとよくわからんな...
Compare and contrast geom_violin() with a facetted geom_histogram(), or a coloured geom_freqpoly(). What are the pros and cons of each method?
If you have a small dataset, it’s sometimes useful to use geom_jitter() to see the relationship between a continuous and categorical variable. The ggbeeswarm package provides a number of methods similar to geom_jitter(). List them and briefly describe what each one does.
How could you rescale the count dataset above to more clearly show the distribution of cut within colour, or colour within cut?
scale_fill_gradient()をいじる.特に対数変換がいい気がする.
# Heat maps of the number of diamonds per color/cut combination: a linear
# fill scale on top and a log-transformed fill scale below, which spreads
# out the low counts.
grid.arrange(
  ggplot(count(diamonds, color, cut), aes(color, cut)) +
    geom_tile(aes(fill = n), position = 'identity') +
    scale_fill_gradient(low = 'white', high = 'steelblue') +
    labs(title = 'w/o log scaling'),
  ggplot(count(diamonds, color, cut), aes(color, cut)) +
    geom_tile(aes(fill = n), position = 'identity') +
    scale_fill_gradient(low = 'white', high = 'steelblue', trans = 'log') +
    labs(title = 'with log scaling')
)
Use geom_tile() together with dplyr to explore how average flight delays vary by destination and month of year. What makes the plot difficult to read? How could you improve it?
library(nycflights13)
# Heat map of the mean departure delay for every destination/month pair.
# dep_delay contains missing values, and mean() returns NA as soon as one
# value in a group is NA; na.rm = TRUE drops them so that a destination/month
# cell is only empty when it has no recorded delay at all.
flights %>%
  group_by(dest, month) %>%
  summarise(avdelay = mean(dep_delay, na.rm = TRUE)) %>%
  # summarise() only peels off the last grouping level; drop the rest too.
  ungroup() %>%
  ggplot(aes(x = dest, y = month)) +
  geom_tile(aes(fill = avdelay)) +
  # Flip so the many destination labels run down the vertical axis.
  coord_flip()
Why is it slightly better to use aes(x = color, y = cut) rather than aes(x = cut, y = color) in the example above?
# The same log-scaled count heat map in both orientations:
# color on x / cut on y (top) versus cut on x / color on y (bottom).
grid.arrange(
  ggplot(count(diamonds, color, cut), aes(color, cut)) +
    geom_tile(aes(fill = n)) +
    scale_fill_gradient(trans = 'log'),
  ggplot(count(diamonds, color, cut), aes(cut, color)) +
    geom_tile(aes(fill = n)) +
    scale_fill_gradient(trans = 'log')
)
なんだろう...
くらいじゃないかな?
Instead of summarising the conditional distribution with a boxplot, you could use a frequency polygon. What do you need to consider when using cut_width() vs cut_number()? How does that impact a visualisation of the 2d distribution of carat and price?
# Price boxplots over 20 carat bins that each hold roughly the same number
# of diamonds (cut_number), placed along the carat axis.
ggplot(diamonds, aes(carat, price, group = cut_number(carat, 20))) +
  geom_boxplot()
# Price frequency polygons, one line per carat bin; cut_number() makes 10
# bins with roughly equal numbers of diamonds.
ggplot(diamonds, aes(price)) +
  geom_freqpoly(aes(colour = cut_number(carat, 10)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Price frequency polygons, one line per carat bin; cut_width() makes bins
# that are each 0.5 carat wide, so the bins hold very different counts.
ggplot(diamonds, aes(price)) +
  geom_freqpoly(aes(colour = cut_width(carat, 0.5)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Visualise the distribution of carat, partitioned by price.
# 2d-binned counts of carat against price, with a log-transformed fill.
# Counts are a one-sided quantity, so a sequential scale_fill_gradient() is
# used, matching the other log-scaled heat maps in this file. The diverging
# scale_fill_gradient2() centres its palette on midpoint = 0 by default,
# and 0 has no finite position on a log scale.
ggplot(diamonds, aes(x = price, y = carat)) +
  geom_bin2d() +
  scale_fill_gradient(trans = 'log')
How does the price distribution of very large diamonds compare to small diamonds. Is it as you expect, or does it surprise you?
# Flag diamonds under 3 carats (`smaller`) and move the flag to the first
# column.
diamonds4 <- select(
  mutate(diamonds, smaller = carat < 3),
  smaller, everything()
)

# Price against carat, with points coloured by the flag and a separate
# smoothed trend line (no confidence band) for each group.
ggplot(diamonds4, aes(carat, price)) +
  geom_point(aes(colour = smaller), alpha = 0.5) +
  geom_smooth(aes(linetype = smaller), se = FALSE)
## `geom_smooth()` using method = 'gam'
Combine two of the techniques you’ve learned to visualise the combined distribution of cut, carat, and price.
# Price against carat, one panel per cut; heavy transparency keeps the
# dense regions readable.
ggplot(diamonds, aes(carat, price)) +
  geom_point(alpha = 1 / 50) +
  facet_wrap(~cut, nrow = 2)
# Same relationship with hexagonal binning, one panel per cut.
# Binning (hex etc.) is lighter to draw than plotting every point.
ggplot(diamonds, aes(carat, price)) +
  geom_hex() +
  facet_wrap(~cut, nrow = 2)
# Price against carat in a single panel: faint points plus one smoothed
# trend line per cut (no confidence band), all coloured by cut.
ggplot(diamonds, aes(carat, price)) +
  geom_point(aes(colour = cut), alpha = 0.1) +
  geom_smooth(aes(colour = cut), se = FALSE)
## `geom_smooth()` using method = 'gam'
# At the same carat, a worse cut is indeed cheaper.
# The plot below, by contrast, suffers from confounding.
ggplot(diamonds, aes(cut, price)) +
  geom_boxplot()
Two dimensional plots reveal outliers that are not visible in one dimensional plots. For example, some points in the plot below have an unusual combination of x and y values, which makes the points outliers even though their x and y values appear normal when examined separately.
# Scatterplot matrix of x and y: the joint panel exposes unusual x/y
# combinations, alongside the one-dimensional view of each variable.
diamonds %>%
  select(x, y) %>%
  ggpairs()
2d plotだとある外れ値が各軸に射影するとなくなるね.